INT104 coursework 2 (student id: 1931391)¶

In [1]:
# Install all related packages which will be Used

import time
import pandas as pd
import seaborn as sns
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsClassifier

Step 1: dimensionality reduction (PCA algorithm)¶

In [2]:
def pca_and_visualize(input_file, output_file, unit_index, label):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    """

    # Read the raw data.csv file
    df = pd.read_csv(input_file)

    # Delete the Label=2 value in dataframe
    df = df[df['Label'] != 2]

    # Separate the data frame of Patient index and multiple feature columns
    df_features = df.drop([label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Perform PCA dimensionality reduction on the feature columns
    pca = PCA(n_components=10)
    pca_result = pca.fit_transform(df_features.iloc[:, 1:].values)

    # Build a new data frame
    df_pca = pd.DataFrame(pca_result, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])
    df_pca.insert(0, unit_index, df_features[unit_index])

    # Align and merge the second data frame with the new data frame obtained in step three using a common Patient index
    df_new = pd.merge(df_pca, df_labels, on=unit_index)

    # Save PCA result to CSV file
    df_new.to_csv(output_file, index=False)

    # Draw scatter plot (PC1, PC2, PC3)
    fig = plt.figure(figsize=(10, 10))  # Set the size of the Figure
    ax = fig.add_subplot(projection='3d')
    groups = df_new.groupby(label)
    for name, group in groups:
        ax.scatter(group['PC1'], group['PC2'], group['PC3'], label=name)
    ax.legend()
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.set_zlabel('PC3')
    plt.title('PCA')
    plt.subplots_adjust(left=0.1)  # Adjust left margin whitespace
    plt.show()
    fig.savefig('PCA10_scatter_plot.png')

    # Draw scatter matrix
    scatter_matrix(df_new.iloc[:, :-1], c=df_new[label], figsize=(20, 20), marker='o')
    plt.show()
    fig.savefig('PCA10_scatter_matrix.png')

    # Draw heatmap
    plt.figure(figsize=(15, 15))
    ax = plt.subplot()
    sns.heatmap(df_new.corr(), annot=True, ax=ax)
    plt.show()
    fig.savefig('PCA10_heatmap.png')

    # Test model performance
    print('Variance contribution rates (10):',
          sum(pca.explained_variance_ratio_))

    # Additionally, calculate variance contribution rates of the first 2, 3, 6 principal components
    pca_2 = PCA(n_components=2)
    pca_3 = PCA(n_components=3)
    pca_6 = PCA(n_components=6)
    pca_2.fit(df_features.iloc[:, 1:].values)
    pca_3.fit(df_features.iloc[:, 1:].values)
    pca_6.fit(df_features.iloc[:, 1:].values)
    print('Variance contribution rates (2):',
          sum(pca_2.explained_variance_ratio_))
    print('Variance contribution rates (3):',
          sum(pca_3.explained_variance_ratio_))
    print('Variance contribution rates (6):',
          sum(pca_6.explained_variance_ratio_))

    # Fit PCA model with cross-validation
    print('Cross validate scores:',
          cross_validate(pca, df, cv=10))


if __name__ == '__main__':
    start_time = time.time()
    pca_and_visualize('Data.csv', 'PCA10_data.csv', 'Patient index', 'Label')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
Variance contribution rates (10): 0.8294108531299189
Variance contribution rates (2): 0.32479218230390283
Variance contribution rates (3): 0.41037573186770915
Variance contribution rates (6): 0.6119554104378722
Cross validate scores: {'fit_time': array([0.01495981, 0.01193666, 0.01196861, 0.01097131, 0.01296568,
       0.00997376, 0.01097035, 0.01401305, 0.00997138, 0.0119679 ]), 'score_time': array([0.00099778, 0.00099754, 0.00099683, 0.0019958 , 0.00199389,
       0.00199437, 0.00199485, 0.0009973 , 0.0009973 , 0.00199437]), 'test_score': array([-19.9893636 , -18.25827548, -17.671087  , -17.41411827,
       -17.77187038, -18.07353759, -18.5401708 , -18.59057024,
       -18.80471495, -19.59787247])}
Execution time: 11.55s

Step 2 [1]: training classifiers in a supervised way (K-Nearest Neighbors algorithm)¶

In [3]:
def knn_classification(input_file, output_file, unit_index, label, k_neighbors):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param k_neighbors: int, k neighbors nearby
    """

    # Read the input file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit KNN model
    knn = KNeighborsClassifier(n_neighbors=k_neighbors)
    knn.fit(df_features, df_labels[label])

    # Predict labels
    predicted_labels = knn.predict(df_features)
    df_labels['Predicted Label'] = predicted_labels

    # Export the new data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw KNN scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('KNN Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of KNN classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit KNN model with cross-validation
    scores = cross_validate(knn, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == '__main__':
    start_time = time.time()
    knn_classification('PCA10_data.csv', 'new_knn_data.csv', 'Patient index', 'Label', 47)
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_38696\1958426232.py:25: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = predicted_labels
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6243416102332581
Precision: 0.6045510455104551
Recall: 0.4206247325631151
F1 score: 0.49608882159979817
Cross validate scores: {'fit_time': array([0.00897574, 0.00997281, 0.0129652 , 0.00897503, 0.00996447,
       0.00997376, 0.00997138, 0.00997496, 0.00897646, 0.0099721 ]), 'score_time': array([0.06084275, 0.0608356 , 0.05784559, 0.0558517 , 0.05585027,
       0.05539942, 0.0528574 , 0.05186081, 0.06482673, 0.05086398]), 'test_score': array([0.54887218, 0.57330827, 0.54323308, 0.58458647, 0.55263158,
       0.68796992, 0.69303202, 0.63653484, 0.48964218, 0.47269303])}
Execution time: 1.65s

Step 2 [2]: training classifiers in a supervised way (Support Vector Machine algorithm)¶

In [4]:
def svm_classification(input_file, output_file, unit_index, label, C=1.0, kernel='linear', gamma='scale'):
    """
    :param input_file: str, input data file path
    :param output_file: str, output file path
    :param unit_index: str, column name of unit index
    :param label: str, column name of label
    :param C: float, degree of punishment for controlling classification errors
    :param kernel: str, kernel function which can improve the accuracy and generalization ability of the classifier
    :param gamma: {'scale', 'auto'} or float, affects the Gaussian kernel function
    """

    # Read input data file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit SVM model
    svm = SVC(C=C, kernel=kernel, gamma=gamma)
    svm.fit(df_features, df_labels[label])

    # Predict labels
    df_labels['Predicted Label'] = svm.predict(df_features)

    # Export the new data frame with predicted labels as output file
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw SVM scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('SVM Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of SVM classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit SVM model with cross-validation
    scores = cross_validate(svm, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == "__main__":
    start_time = time.time()
    svm_classification('PCA10_data.csv', 'new_svm_data.csv', 'Patient index', 'Label', C=1.0, kernel='linear',
                       gamma='scale')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_38696\1008353979.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = svm.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6059066967644846
Precision: 0.5735115431348724
Recall: 0.40393667094565683
F1 score: 0.4740145618880241
Cross validate scores: {'fit_time': array([0.75697565, 0.69201589, 0.7514708 , 0.69665265, 0.72290254,
       0.69580579, 0.57151413, 0.6647296 , 0.68868637, 0.71408963]), 'score_time': array([0.03590345, 0.03194094, 0.03291178, 0.03291154, 0.03291273,
       0.03393769, 0.03490686, 0.03291297, 0.03291202, 0.0428865 ]), 'test_score': array([0.54699248, 0.55451128, 0.53007519, 0.58458647, 0.54135338,
       0.72932331, 0.56120527, 0.72693032, 0.47457627, 0.46892655])}
Execution time: 8.89s

Step 2 [3]: training classifiers in a supervised way (Decision Tree algorithm)¶

In [5]:
def dt_classification(input_file, output_file, unit_index, label, max_depth, min_samples_leaf,
                      min_samples_split,
                      max_features):
    """
        :param input_file: str, input data file path
        :param output_file: str, output file path
        :param unit_index: str, column name of unit index
        :param label: str, column name of label
        :param max_depth, int, maximum depth of the tree
        :param min_samples_leaf, int, minimum number of samples for a leaf node
        :param min_samples_split, int, minimum number of samples with internal nodes
        :param max_features, int, maximum number of features considered at each node split
    """

    # Read input data file
    df = pd.read_csv(input_file)

    # Separate the data frame of Patient index, PCs columns
    df_features = df.drop([unit_index, label], axis=1)

    # Separate the data frame with Patient index and Label
    df_labels = df[[unit_index, label]]

    # Fit DT model
    dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split, max_features=max_features)
    dt.fit(df_features, df_labels[label])

    # Predict labels
    df_labels['Predicted Label'] = dt.predict(df_features)

    # Export the new data frame with predicted labels
    df_labels.to_csv(output_file, index=False)

    # Compute the confusion matrix
    tp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 1)).sum()
    tn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 0)).sum()
    fp = ((df_labels['Predicted Label'] == 1) & (df_labels[label] == 0)).sum()
    fn = ((df_labels['Predicted Label'] == 0) & (df_labels[label] == 1)).sum()

    # Compute the accuracy, precision, and recall
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    F1_score = 2 * (precision * recall / (precision + recall))

    # Draw DT scatter plot with predicted labels
    plt.figure(figsize=(10, 8))
    fig, ax = plt.subplots()
    groups = df_labels.groupby('Predicted Label')
    for name, group in groups:
        ax.scatter(df.loc[group.index, 'PC1'], df.loc[group.index, 'PC2'], label=name)
    ax.legend()
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.title('DT Classification')
    plt.show()

    # Compute confusion matrix
    cm = confusion_matrix(df_labels[label], df_labels['Predicted Label'])

    # Plot confusion matrix using seaborn heatmap
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # print out the accuracy, precision, recall, F1 score of DT classifier model
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('Recall:', recall)
    print('F1 score:', F1_score)

    # Fit Decision Tree model with cross-validation
    scores = cross_validate(dt, df_features, df_labels[label], cv=10)
    print("Cross validate scores:", scores)


if __name__ == "__main__":
    start_time = time.time()
    dt_classification('PCA10_data.csv', 'new_dt_data.csv', 'Patient index', 'Label', 5, 5, 10, 'sqrt')
    end_time = time.time()
    print('Execution time: {:.2f}s'.format(end_time - start_time))
C:\Users\Scort\AppData\Local\Temp\ipykernel_38696\3175021458.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['Predicted Label'] = dt.predict(df_features)
<Figure size 1000x800 with 0 Axes>
Accuracy: 0.6038374717832957
Precision: 0.608044901777362
Recall: 0.2781343602909713
F1 score: 0.3816793893129771
Cross validate scores: {'fit_time': array([0.00698185, 0.00598431, 0.00698066, 0.00698113, 0.00797892,
       0.0079782 , 0.0079782 , 0.00897646, 0.00797868, 0.00698185]), 'score_time': array([0.00199437, 0.00199485, 0.00199461, 0.00099754, 0.00199461,
       0.00199509, 0.00099707, 0.00199461, 0.00199437, 0.00199485]), 'test_score': array([0.53947368, 0.54511278, 0.54135338, 0.58458647, 0.53571429,
       0.71992481, 0.7740113 , 0.66101695, 0.49340866, 0.46516008])}
Execution time: 0.50s

Step 3: unsupervised classification¶

In [5]: